# Alex Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(here)
library(sf)
library(janitor)

message("Starting DMA data processing...")

# Function to load DMA data from input directory
load_dma_data <- function() {
  dma_path <- here("data", "input", "dma")
  
  message("Looking for DMA data in: ", dma_path)
  
  if (!dir.exists(dma_path)) {
    dir.create(dma_path, recursive = TRUE)
    message("Created directory: ", dma_path)
  }
  
  # Look for various file formats in order of preference
  # Priority 1: Shapefile (what we currently have)
  shp_files <- list.files(dma_path, pattern = "\\.shp$", full.names = TRUE, recursive = TRUE)
  
  # Priority 2: Other spatial formats
  geojson_files <- list.files(dma_path, pattern = "\\.(geojson|json)$", full.names = TRUE, recursive = TRUE)
  all_dirs <- list.dirs(dma_path, full.names = TRUE, recursive = FALSE)
  gdb_dirs <- all_dirs[grepl("\\.gdb$", all_dirs)]
  kml_files <- list.files(dma_path, pattern = "\\.kml$", full.names = TRUE, recursive = TRUE)
  
  # Try loading in order of preference
  if (length(shp_files) > 0) {
    message("Loading DMA data from shapefile: ", basename(shp_files[1]))
    return(st_read(shp_files[1], quiet = TRUE))
  } else if (length(geojson_files) > 0) {
    message("Loading DMA data from GeoJSON/JSON: ", basename(geojson_files[1]))
    return(st_read(geojson_files[1], quiet = TRUE))
  } else if (length(gdb_dirs) > 0) {
    message("Loading DMA data from geodatabase: ", basename(gdb_dirs[1]))
    layers <- st_layers(gdb_dirs[1])
    message("Available layers: ", paste(layers$name, collapse = ", "))
    return(st_read(gdb_dirs[1], layer = layers$name[1], quiet = TRUE))
  } else if (length(kml_files) > 0) {
    message("Loading DMA data from KML: ", basename(kml_files[1]))
    return(st_read(kml_files[1], quiet = TRUE))
  }
  
  # If no data found, provide helpful instructions
  message("\n", paste(rep("=", 50), collapse = ""))
  message("NO DMA DATA FOUND")
  message(paste(rep("=", 50), collapse = ""))
  message("\nNo DMA data found in: ", dma_path)
  message("\nRECOMMENDED DATA SOURCES:")
  message("1. BEST: Download from https://datablends.us/2021/01/14/a-useful-dma-shapefile-for-tableau-and-alteryx/")
  message("   - Comprehensive coverage including Alaska & Hawaii")
  message("   - 211 DMA markets")
  message("2. Alternative: Nielsen data from https://github.com/simzou/nielsen-dma/blob/master/nielsentopo.json")
  message("   - Limited to continental US (182 markets)")
  message("\nPlace downloaded files in: ", dma_path)
  message("Supported formats: .shp, .geojson, .json, .kml, .gdb")
  
  stop("No DMA data found - please download data following instructions above")
}

# Process DMA data
process_dma_data <- function(dma_raw) {
  message("Processing DMA data...")
  
  # Display data structure
  message("Input data structure:")
  message("  - Columns: ", paste(names(dma_raw), collapse = ", "))
  message("  - Rows: ", nrow(dma_raw))
  
  # Ensure it's an sf object
  if (!inherits(dma_raw, "sf")) {
    if ("geometry" %in% names(dma_raw) || any(grepl("geom", names(dma_raw), ignore.case = TRUE))) {
      dma_raw <- st_as_sf(dma_raw)
    } else {
      stop("Data does not contain spatial information")
    }
  }
  
  # Set CRS if missing
  if (is.na(st_crs(dma_raw))) {
    message("No CRS found, assuming WGS84 (EPSG:4326)")
    st_crs(dma_raw) <- 4326
  }
  
  # Transform to WGS84 and clean
  dma_processed <- dma_raw %>%
    st_transform(4326) %>%
    janitor::clean_names() %>%
    
    # Standardize column names based on common DMA field patterns
    rename_with(~ case_when(
      .x %in% c("dma", "dma_code", "market", "key") ~ "dma_code",
      .x %in% c("dma_name", "market_name", "name") ~ "dma_name", 
      .x == "tv_homes" ~ "tv_households",
      .x %in% c("area_sq_mi", "areasqmi") ~ "area_sq_miles",
      .x %in% c("area_sq_km", "areasqkm") ~ "area_sq_km",
      TRUE ~ .x
    )) %>%
    
    # Select and create standardized columns
    {
      # Ensure we have a DMA code
      if (!"dma_code" %in% names(.)) {
        . <- mutate(., dma_code = row_number())
      }
      .
    } %>%
    
    # Create clean variables
    mutate(
      # Standardize DMA code as numeric
      dma_code = as.numeric(dma_code),
      
      # Create DMA name if missing
      dma_name = if (!"dma_name" %in% names(.)) {
        paste("DMA", dma_code)
      } else {
        ifelse(is.na(dma_name) | dma_name == "", paste("DMA", dma_code), dma_name)
      },
      
      # Create market size categories - simplified approach
      market_size = "Unknown"
    ) %>%
    
    # Select final columns
    select(any_of(c("dma_code", "dma_name", "market_size", "tv_households", 
                    "area_sq_miles", "area_sq_km", "geometry"))) %>%
    
    # Remove invalid geometries and problematic DMAs
    filter(
      st_is_valid(geometry),
      !is.na(dma_code),
      dma_code > 0,  # Remove code 0 (often a catch-all "National" DMA)
      !dma_name %in% c("National", "")  # Remove national-level or empty names
    ) %>%
    
    # Sort by DMA code
    arrange(dma_code)
  
  return(dma_processed)
}

# Analyze coverage
analyze_coverage <- function(dma_data) {
  message("\n=== DMA COVERAGE ANALYSIS ===")
  
  # Basic stats
  message("Dataset summary:")
  message("  - Total DMAs: ", nrow(dma_data))
  message("  - CRS: ", st_crs(dma_data)$input)
  
  # Bounding box
  bbox <- st_bbox(dma_data)
  message("  - Coverage: Longitude ", round(bbox[1], 2), "° to ", round(bbox[3], 2), 
          "°, Latitude ", round(bbox[2], 2), "° to ", round(bbox[4], 2), "°")
  
  # Coverage assessment
  has_alaska <- bbox[1] < -140
  has_hawaii <- bbox[2] < 20  
  has_conus <- bbox[1] > -130 && bbox[3] > -65 && bbox[2] > 24 && bbox[4] < 50
  
  message("  - Alaska coverage: ", ifelse(has_alaska, "[YES]", "[NO]"))
  message("  - Hawaii coverage: ", ifelse(has_hawaii, "[YES]", "[NO]"))
  message("  - Continental US: ", ifelse(has_conus, "[YES]", "[NO]"))
  
  # Market size distribution
  if ("market_size" %in% names(dma_data)) {
    size_dist <- dma_data %>%
      st_drop_geometry() %>%
      count(market_size, sort = TRUE)
    message("\nMarket size distribution:")
    for (i in seq_len(nrow(size_dist))) {
      message("  - ", size_dist$market_size[i], ": ", size_dist$n[i], " markets")
    }
  }
  
  # Overall assessment
  if (has_alaska && has_hawaii && has_conus) {
    message("\n[OK] EXCELLENT: Complete US coverage including Alaska and Hawaii")
  } else if (has_conus) {
    message("\n[WARN] GOOD: Continental US coverage, missing Alaska/Hawaii")
  } else {
    message("\n[WARN] LIMITED: Incomplete coverage detected")
  }
}

# Main execution
message("=== DMA DATA PROCESSING ===")

# Output file path
output_file <- here("data", "inter", "dma_processed.rds")

# Check if output already exists
if (file.exists(output_file)) {
  message("[OK] DMA data already exists: ", output_file)
} else {
  # Turn off s2 for simpler geometry operations
  sf_use_s2(FALSE)
  
  # Load and process data
  dma_raw <- load_dma_data()
  dma_processed <- process_dma_data(dma_raw)
  
  # Analyze coverage
  analyze_coverage(dma_processed)
  
  # Save processed data
  saveRDS(dma_processed, output_file)
  message("\n[OK] Saved processed DMA data to: ", output_file)
  
  # Display final structure
  message("\n=== FINAL DATA STRUCTURE ===")
  print(glimpse(dma_processed))
}